In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.model_selection import GridSearchCV
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [139]:
df = pd.read_csv("CVD_cleaned.csv")
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [141]:
df.columns

Index(['General_Health', 'Checkup', 'Exercise', 'Heart_Disease', 'Skin_Cancer',
       'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis', 'Sex',
       'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History',
       'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],
      dtype='object')

In [142]:
df.drop(columns=['Skin_Cancer','Other_Cancer'],inplace = True)

In [143]:
df['Diabetes'] = df['Diabetes'].replace({
    'Yes': 'Yes',
    'Yes, but female told only during pregnancy': 'Yes',
    'No': 'No',
    'No, pre-diabetes or borderline diabetes': 'No'
})
df['Diabetes'].value_counts()

Diabetes
No     266037
Yes     42817
Name: count, dtype: int64

In [144]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['number'])

In [145]:
bins = [-float('inf'), 18.5, 24.9, 29.9, float('inf')]
labels = ['Underweight', 'Normal weight', 'Overweight', 'Obesity']

# Create a new column 'obesity' based on BMI classification
df['obesity'] = pd.cut(df['BMI'], bins=bins, labels=labels)
df['obesity'].value_counts()

obesity
Overweight       109866
Obesity          106738
Normal weight     87706
Underweight        4544
Name: count, dtype: int64

In [146]:
label_encoders = {}

# Apply Label Encoding to categorical columns
for column in categorical_columns:
    df[column] = df[column].astype(str)
    label_encoder = LabelEncoder() 
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder 


In [147]:
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [148]:
if 'Heart_Disease' in label_encoders:
    encoder = label_encoders['Heart_Disease']
    mappings = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))
    print("Checkup Label Encoder Mappings:")
    for key, value in mappings.items():
        print(f"{key}: {value}")
else:
    print("Checkup column not found in label encoders.")

Checkup Label Encoder Mappings:
0: No
1: Yes


In [149]:
len(df.columns.tolist())

18

In [150]:
df['Diabetes'].value_counts()

Diabetes
0    266037
1     42817
Name: count, dtype: int64

In [151]:
y_heart_disease = df['Heart_Disease']
y_diabetes = df['Diabetes']
y_obesity = df['obesity']
X_train = df.drop(columns=['Heart_Disease','Diabetes','obesity'])


In [152]:
# Feature scaling (Standardizing the data)
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
X_train_ht, X_test_ht, y_train_ht, y_test_ht = train_test_split(X_train,y_heart_disease,test_size= 0.2)
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_train,y_diabetes,test_size= 0.2)
X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(X_train,y_obesity,test_size= 0.2)

In [154]:
X_train_dt.columns

Index(['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',
       'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',
       'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',
       'Green_Vegetables_Consumption', 'FriedPotato_Consumption'],
      dtype='object')

In [155]:
### Logistic Regression
lr_ht = LogisticRegression()
lr_ht.fit(X_train_ht,y_train_ht)
y_pred_ht = lr_ht.predict(X_test_ht)
print(f"Accuracy: {accuracy_score(y_test_ht, y_pred_ht):.4f}")
print(classification_report(y_test_ht, y_pred_ht))

Accuracy: 0.9201
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56850
           1       0.33      0.00      0.01      4921

    accuracy                           0.92     61771
   macro avg       0.63      0.50      0.48     61771
weighted avg       0.87      0.92      0.88     61771



In [156]:
### Logistic Regression
lr_dt = LogisticRegression()
lr_dt.fit(X_train_dt,y_train_dt)
y_pred_dt = lr_dt.predict(X_test_dt)
print(f"Accuracy: {accuracy_score(y_test_dt, y_pred_dt):.4f}")
print(classification_report(y_test_dt, y_pred_dt))

Accuracy: 0.8609
              precision    recall  f1-score   support

           0       0.86      0.99      0.92     53194
           1       0.49      0.04      0.07      8577

    accuracy                           0.86     61771
   macro avg       0.68      0.52      0.50     61771
weighted avg       0.81      0.86      0.81     61771



In [157]:
### Logistic Regression
lr_ob = LogisticRegression()
lr_ob.fit(X_train_ob,y_train_ob)
y_pred_ob = lr_ob.predict(X_test_ob)
print(f"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}")
print(classification_report(y_test_ob, y_pred_ob))

Accuracy: 0.9732
               precision    recall  f1-score   support

Normal weight       0.97      0.97      0.97     17380
      Obesity       0.99      0.99      0.99     21321
   Overweight       0.98      0.98      0.98     22138
  Underweight       0.63      0.65      0.64       932

     accuracy                           0.97     61771
    macro avg       0.89      0.89      0.89     61771
 weighted avg       0.97      0.97      0.97     61771



In [158]:
### Logistic Regression
lr_ob = LogisticRegression()
lr_ob.fit(X_train_ob,y_train_ob)
y_pred_ob = lr_ob.predict(X_test_ob)
print(f"Accuracy: {accuracy_score(y_test_ob, y_pred_ob):.4f}")
print(classification_report(y_test_ob, y_pred_ob))

Accuracy: 0.9732
               precision    recall  f1-score   support

Normal weight       0.97      0.97      0.97     17380
      Obesity       0.99      0.99      0.99     21321
   Overweight       0.98      0.98      0.98     22138
  Underweight       0.63      0.65      0.64       932

     accuracy                           0.97     61771
    macro avg       0.89      0.89      0.89     61771
 weighted avg       0.97      0.97      0.97     61771



In [159]:
model_filename = 'lr_ht.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(lr_ht, file)

print(f"Model saved to {model_filename}")

Model saved to lr_ht.pkl


In [160]:
model_filename = 'lr_dt.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(lr_dt, file)

print(f"Model saved to {model_filename}")

Model saved to lr_dt.pkl


In [161]:
model_filename = 'lr_ob.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(lr_ob, file)

print(f"Model saved to {model_filename}")

Model saved to lr_ob.pkl


In [112]:
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Now you can use loaded_model to make predictions
y_pred_loaded = loaded_model.predict(X_test_ob)
print(f"Loaded Model Accuracy: {accuracy_score(y_test_ob, y_pred_loaded):.4f}")

Loaded Model Accuracy: 0.9727


In [113]:
X_test_ob.columns

Index(['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer', 'Other_Cancer',
       'Depression', 'Arthritis', 'Sex', 'Age_Category', 'Height_(cm)',
       'Weight_(kg)', 'BMI', 'Smoking_History', 'Alcohol_Consumption',
       'Fruit_Consumption', 'Green_Vegetables_Consumption',
       'FriedPotato_Consumption'],
      dtype='object')

In [114]:
categorical_columns.tolist()

['General_Health',
 'Checkup',
 'Exercise',
 'Heart_Disease',
 'Skin_Cancer',
 'Other_Cancer',
 'Depression',
 'Diabetes',
 'Arthritis',
 'Sex',
 'Age_Category',
 'Smoking_History']

In [115]:
# Define the columns
columns = ['General_Health', 'Checkup', 'Exercise', 'Depression', 'Arthritis',
           'Sex', 'Age_Category', 'Height_(cm)', 'Weight_(kg)', 'BMI',
           'Smoking_History', 'Alcohol_Consumption', 'Fruit_Consumption',
           'Green_Vegetables_Consumption', 'FriedPotato_Consumption']

# Define the input values for each column (replace these with actual values)
input_values = {
    'General_Health': 'Poor',  # Example values
    'Checkup': 'Within the past 2 years',
    'Exercise': 'No',
    'Depression': 'No',
    'Arthritis': 'Yes',
    'Sex': 'Female',
    'Age_Category': '70-74',
    'Height_(cm)': 150,  # Example numerical values
    'Weight_(kg)': 32.66,
    'BMI': 14.34,
    'Smoking_History': 'Yes',
    'Alcohol_Consumption': 0.0,
    'Fruit_Consumption': 30.0,
    'Green_Vegetables_Consumption': 16,
    'FriedPotato_Consumption': 12
}

# Create a DataFrame from input values
input_df = pd.DataFrame([input_values])

# Encode categorical columns using the same LabelEncoders you used during training
for column in categorical_columns:
    if column in input_df.columns:
        # Transform the input values using the stored encoder
        input_df[column] = label_encoders[column].transform(input_df[column].astype(str))

# Display the input DataFrame after encoding
print("Encoded Input DataFrame:")
input_df


Encoded Input DataFrame:


Unnamed: 0,General_Health,Checkup,Exercise,Depression,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,3,2,0,0,1,0,10,150,32.66,14.34,1,0.0,30.0,16,12


In [116]:
# Prepare input for prediction (make sure to drop any non-feature columns if necessary)
# X_input = input_df[columns]

# Make the prediction
prediction = loaded_model.predict(input_df)

# Output the prediction
print(f"Predicted Class: {prediction[0]}")

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Other_Cancer
- Skin_Cancer


In [None]:
prediction

array(['Underweight'], dtype=object)

In [None]:
model_filename = 'lr_dt.pkl'

with open(model_filename, 'rb') as file:
    loaded_model_lr_dt = pickle.load(file)
y_pred_loaded = loaded_model_lr_dt.predict_proba(input_df)
y_pred_loaded

array([[0.90208773, 0.01696106, 0.06855007, 0.01240114]])

In [None]:
df